{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generating and Writing Data to GCS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import multiprocessing as mp\n",
    "import random\n",
    "\n",
    "# You need to update these to your real paths!\n",
    "dataRoot = os.getenv(\"DATA_ROOT\", '/path/to/your/datasets')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We define the generate_data function which takes an integer i as input and generates sales data using random numbers. The generated data includes sales ID, product name, price, quantity sold, date of sale, and customer ID. The function returns a tuple of the generated data.\n",
    "def generate_data(i):\n",
    "    sales_id = \"s_{}\".format(i)\n",
    "    product_name = \"Product_{}\".format(i)\n",
    "    price = random.uniform(1,10)\n",
    "    quantity_sold = random.randint(1,10)\n",
    "    date_of_sale = \"2022-{}-{}\".format(random.randint(1,12), random.randint(1,28))\n",
    "    customer_id = \"c_{}\".format(random.randint(1,10000))\n",
    "    return (sales_id, product_name, price, quantity_sold, date_of_sale, customer_id)\n",
    "\n",
    "with mp.Pool(mp.cpu_count()) as p:\n",
    "    sales_data = p.map(generate_data, range(1000000))\n",
    "    sales_data = list(sales_data)\n",
    "    \n",
    "print(\"write to gcs started\")\n",
    "sales_df = pd.DataFrame(sales_data, columns=[\"sales_id\", \"product_name\", \"price\", \"quantity_sold\", \"date_of_sale\", \"customer_id\"])\n",
    "os.makedirs(dataRoot+\"/sales/\", exist_ok=True)\n",
    "sales_df.to_csv(dataRoot+\"/sales/data.csv\", index=False, header=True)\n",
    "print(\"Write to gcs completed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_data(i):\n",
    "    product_name = \"Product_{}\".format(i)\n",
    "    shelf_life = random.randint(1,365)\n",
    "    contains_promotion = \"{} % off\".format(random.randint(0,10))\n",
    "    quantity_in_stock = random.randint(1,100)\n",
    "    location = \"Location_{}\".format(random.randint(1,100))\n",
    "    date_received = \"2022-{}-{}\".format(random.randint(1,12), random.randint(1,28))\n",
    "    return (product_name,shelf_life,contains_promotion,quantity_in_stock, location, date_received)\n",
    "\n",
    "with mp.Pool(mp.cpu_count()) as p:\n",
    "    stock_data = p.map(generate_data, range(50000))\n",
    "    stock_data = list(stock_data)\n",
    "    \n",
    "stock_df = pd.DataFrame(stock_data,  columns=[\"product_name\",\"shelf_life\",\"contains_promotion\",\"quantity_in_stock\", \"location\", \"date_received\"])\n",
    "os.makedirs(dataRoot+\"/stock/\", exist_ok=True)\n",
    "stock_df.to_json(dataRoot+\"/stock/stock.json\", orient='records')\n",
    "print(\"Write to gcs completed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_data(i):\n",
    "    sup_id = \"s_{}\".format(i)\n",
    "    product_name = \"Product_{}\".format(i)\n",
    "    quantity_ordered = random.randint(1,100)\n",
    "    price = random.uniform(1,10)\n",
    "    date_ordered = \"2022-{}-{}\".format(random.randint(1,12), random.randint(1,28))\n",
    "    return (sup_id,product_name, quantity_ordered, price, date_ordered)\n",
    "\n",
    "with mp.Pool(mp.cpu_count()) as p:\n",
    "    supplier_data = p.map(generate_data, range(50000))\n",
    "    supplier_data = list(supplier_data)\n",
    "    \n",
    "supplier_df = pd.DataFrame(supplier_data,  columns=[\"sup_id\",\"product_name\", \"quantity_ordered\", \"price\", \"date_ordered\"])\n",
    "os.makedirs(dataRoot+\"/supplier/\", exist_ok=True)\n",
    "supplier_df.to_json(dataRoot+\"/supplier/supplier.json\", orient='records')\n",
    "print(\"Write to gcs completed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_data(i):\n",
    "    customer_id = \"c_{}\".format(i)\n",
    "    customer_name = \"Customer_{}\".format(i)\n",
    "    age = random.randint(20,70)\n",
    "    gender = random.choice([\"male\", \"female\"])\n",
    "    purchase_history = random.randint(1,100)\n",
    "    contact_info = \"email_{}@gmail.com\".format(i)\n",
    "    return (customer_id,customer_name, age, gender, purchase_history, contact_info)\n",
    "\n",
    "with mp.Pool(mp.cpu_count()) as p:\n",
    "    customer_data = p.map(generate_data, range(1000))\n",
    "    customer_data = list(customer_data)\n",
    "    \n",
    "customer_df = pd.DataFrame(customer_data,  columns=[\"customer_id\",\"customer_name\", \"age\", \"gender\", \"purchase_history\", \"contact_info\"])\n",
    "os.makedirs(dataRoot+\"/customer/\", exist_ok=True)\n",
    "customer_df.to_csv(dataRoot+\"/customer/customer.csv\", index=False,header=True)\n",
    "print(\"Write to gcs completed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_data(i):\n",
    "    product_name = \"Product_{}\".format(i)\n",
    "    competitor_price = random.uniform(1,100)\n",
    "    sales_trend = random.randint(1,100)\n",
    "    demand_forecast = random.randint(1,100)\n",
    "    return (product_name, competitor_price, sales_trend, demand_forecast)\n",
    "\n",
    "with mp.Pool(mp.cpu_count()) as p:\n",
    "    market_data = p.map(generate_data, range(500000))\n",
    "    market_data = list(market_data)\n",
    "    \n",
    "market_df = pd.DataFrame(market_data,  columns=[\"product_name\", \"competitor_price\", \"sales_trend\", \"demand_forecast\"])\n",
    "os.makedirs(dataRoot+\"/market/\", exist_ok=True)\n",
    "market_df.to_csv(dataRoot+\"/market/market.csv\", index=False,header=True)\n",
    "print(\"Write to gcs completed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_data(i):\n",
    "    product_name = \"Product_{}\".format(i)\n",
    "    shipping_cost = random.uniform(1,10)\n",
    "    transportation_cost = random.uniform(1,10)\n",
    "    warehouse_cost = random.uniform(1,10)\n",
    "    return (product_name, shipping_cost, transportation_cost, warehouse_cost)\n",
    "\n",
    "with mp.Pool(mp.cpu_count()) as p:\n",
    "    logistic_data = p.map(generate_data, range(500000))\n",
    "    logistic_data = list(logistic_data)\n",
    "    \n",
    "logistic_df = pd.DataFrame(logistic_data,  columns=[\"product_name\", \"shipping_cost\", \"transportation_cost\", \"warehouse_cost\"])\n",
    "os.makedirs(dataRoot+\"/logistic/\", exist_ok=True)\n",
    "logistic_df.to_csv(dataRoot+\"/logistic/logistic.csv\", index=False,header=True)\n",
    "print(\"Write to gcs completed\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
